df <- as.data.frame(read_dta("./Cleaned_nuMoM2B_dataset_draft_8.31.2021.dta"))

a<-df %>% dplyr::select(numomid,pree_acog,pct_emptyc,d_totdens,p_totdens,f_totdens,p_seaplantdens,sodium_dens,g_nwhldens,g_whldens,v_totdens,f_soliddens,v_beangrendens,fatratio,heix10_sodium,momeduc,married,insurance,momrace4,prediab,prehtn,gravcat,v1_pregplanned,smokerpre,momage,bmiprepreg,dt_kcal)
a$pree_acog<-as.factor(a$pree_acog)
a$momeduc<-as.factor(a$momeduc)
a$married<-as.factor(a$married)
a$insurance<-as.factor(a$insurance)
a$momrace4<-as.factor(a$momrace4)
a$prediab<-as.factor(a$prediab)
a$prehtn<-as.factor(a$prehtn)
a$v1_pregplanned<-as.factor(a$v1_pregplanned)
a$smokerpre<-as.factor(a$smokerpre)
a$gravcat<-as.factor(a$gravcat)
#str(a)
#a
aggr(a)

Mode <- function (x, na.rm) {
    xtab <- table(x)
    xmode <- names(which(xtab == max(xtab)))
    if (length(xmode) > 1) xmode <- ">1 mode"
    return(xmode)
}

for (var in 1:ncol(a)) {
    if (class(a[,var])=="numeric") {
        a[is.na(a[,var]),var] <- mean(a[,var], na.rm = TRUE)
    } else if (class(a[,var]) %in% c("character", "factor")) {
        a[is.na(a[,var]),var] <- Mode(a[,var], na.rm = TRUE)
    }
}

aggr(a)

summary(a)
##    numomid          pree_acog   pct_emptyc        d_totdens      
##  Length:10038       0:9204    Min.   :  8.213   Min.   :0.04607  
##  Class :character   1: 834    1st Qu.: 26.993   1st Qu.:0.60555  
##  Mode  :character             Median : 31.833   Median :0.87474  
##                               Mean   : 31.833   Mean   :0.87474  
##                               3rd Qu.: 35.143   3rd Qu.:0.99575  
##                               Max.   :136.413   Max.   :4.61078  
##    p_totdens        f_totdens        p_seaplantdens      sodium_dens    
##  Min.   :0.1362   Min.   :0.000382   Min.   :0.003639   Min.   :0.5943  
##  1st Qu.:2.0155   1st Qu.:0.493894   1st Qu.:0.465902   1st Qu.:1.4859  
##  Median :2.4042   Median :0.831714   Median :0.801858   Median :1.6116  
##  Mean   :2.4042   Mean   :0.831714   Mean   :0.801858   Mean   :1.6116  
##  3rd Qu.:2.6193   3rd Qu.:1.007284   3rd Qu.:0.959537   3rd Qu.:1.7375  
##  Max.   :9.4185   Max.   :5.613823   Max.   :4.786033   Max.   :3.4519  
##    g_nwhldens       g_whldens        v_totdens         f_soliddens      
##  Min.   :0.1426   Min.   :0.0000   Min.   :0.006568   Min.   :0.000382  
##  1st Qu.:1.8369   1st Qu.:0.3360   1st Qu.:0.630542   1st Qu.:0.265075  
##  Median :2.2344   Median :0.6309   Median :0.954826   Median :0.574514  
##  Mean   :2.2344   Mean   :0.6309   Mean   :0.954826   Mean   :0.574514  
##  3rd Qu.:2.5424   3rd Qu.:0.7809   3rd Qu.:1.090429   3rd Qu.:0.718450  
##  Max.   :9.8700   Max.   :3.6824   Max.   :5.454516   Max.   :5.008740  
##  v_beangrendens      fatratio     heix10_sodium    momeduc  married  insurance
##  Min.   :0.0000   Min.   :0.796   Min.   : 0.000   0: 818   0:3911   1:6821   
##  1st Qu.:0.1031   1st Qu.:1.703   1st Qu.: 2.916   1:1178   1:6127   2:3036   
##  Median :0.2361   Median :1.947   Median : 4.370   2:2957            3: 181   
##  Mean   :0.2534   Mean   :1.947   Mean   : 4.370   3:5085                     
##  3rd Qu.:0.2959   3rd Qu.:2.086   3rd Qu.: 5.712                              
##  Max.   :2.9187   Max.   :4.580   Max.   :10.000                              
##  momrace4 prediab  prehtn   gravcat  v1_pregplanned smokerpre     momage     
##  1:5952   0:9891   0:9745   1:7448   1:5877         0:8256    Min.   :13.00  
##  2:1424   1: 147   1: 293   2:1916   2:4161         1:1782    1st Qu.:22.00  
##  3:1747                     3: 674                            Median :27.00  
##  4: 915                                                       Mean   :26.94  
##                                                               3rd Qu.:31.00  
##                                                               Max.   :45.00  
##    bmiprepreg       dt_kcal        
##  Min.   :11.54   Min.   :   25.12  
##  1st Qu.:21.33   1st Qu.: 1254.67  
##  Median :23.94   Median : 1691.30  
##  Mean   :25.62   Mean   : 1720.19  
##  3rd Qu.:28.06   3rd Qu.: 1866.36  
##  Max.   :72.49   Max.   :17054.34

We can use ggpairs to explore scatter plot sets:

## you may need to run this outside the md environment, bc the figure created is very large.
a %>% {
  bind_cols(
    select_if(., is.numeric),
    select_at(., "momrace4")
  )
} %>%
  GGally::ggpairs(.,
                  columns=1:16,
                  mapping = ggplot2::aes(colour=momrace4), 
                  lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1, se=F)),
                  upper = list(continuous = wrap(ggally_cor, stars=FALSE)))

ggsave("./NuMOM_pairwise-2021_10_19.tiff",width=600,height=600,units="mm",limitsize=F,dpi=300, compression = "lzw")
a%>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins=15)
## Warning: attributes are not identical across measure variables;
## they will be dropped

a%>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()
## Warning: attributes are not identical across measure variables;
## they will be dropped

a%>%
  keep(is.factor) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped

b<- a%>% dplyr::select(bmiprepreg,pct_emptyc,d_totdens,f_soliddens,heix10_sodium,fatratio,dt_kcal,momrace4,momeduc,gravcat,smokerpre)

b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=momrace4)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom race')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=momeduc)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom education')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=gravcat)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom gravidity')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=smokerpre)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom smoking status')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

c<- a%>% dplyr::select(momage,pct_emptyc,d_totdens,f_soliddens,heix10_sodium,fatratio,dt_kcal,momrace4,momeduc,gravcat,smokerpre)

c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=momrace4)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by mom race')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=momeduc)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by mom education')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=gravcat)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by gravidity')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=smokerpre)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by mom smoking status')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scale_a<-a %>%select(-numomid)%>% mutate_if(is.numeric, scale)
fviz_nbclust(scale_a,kmeans,method='wss')

fviz_nbclust(scale_a,kmeans,method='silhouette')

k2<-kmeans(scale_a,centers=2,nstart=50)
#fviz_cluster(km, geom = "point", data = scale_a) 
scale_a$clusters_k2<-k2$cluster
cluster_model_k2<-vglm(factor(clusters_k2) ~.,data=scale_a,family='multinomial')
summary(cluster_model_k2)
## 
## Call:
## vglm(formula = factor(clusters_k2) ~ ., family = "multinomial", 
##     data = scale_a)
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.33372    0.38117  -3.499 0.000467 ***
## pree_acog1       0.04783    0.16969   0.282 0.778025    
## pct_emptyc      -1.21173    0.11796 -10.273  < 2e-16 ***
## d_totdens        0.02403    0.06883   0.349 0.726971    
## p_totdens        0.51362    0.09708   5.291 1.22e-07 ***
## f_totdens        0.83268    0.10885   7.650 2.01e-14 ***
## p_seaplantdens   0.93166    0.07861  11.852  < 2e-16 ***
## sodium_dens      0.73679    0.38583   1.910 0.056186 .  
## g_nwhldens      -0.48298    0.08077  -5.980 2.23e-09 ***
## g_whldens        0.78123    0.06359  12.285  < 2e-16 ***
## v_totdens        1.17293    0.17832   6.578 4.78e-11 ***
## f_soliddens      0.96427    0.11336   8.506  < 2e-16 ***
## v_beangrendens   0.93544    0.14502   6.450 1.12e-10 ***
## fatratio         0.88192    0.08398  10.502  < 2e-16 ***
## heix10_sodium   -0.56732    0.36505  -1.554 0.120161    
## momeduc1         0.18604    0.40142   0.463 0.643041    
## momeduc2         1.51470    0.35996   4.208 2.58e-05 ***
## momeduc3         3.09217    0.37137   8.326  < 2e-16 ***
## married1         0.77749    0.13507   5.756 8.61e-09 ***
## insurance2      -0.96936    0.14216  -6.819 9.18e-12 ***
## insurance3      -1.68268    0.36004  -4.674 2.96e-06 ***
## momrace42       -0.68812    0.17700  -3.888 0.000101 ***
## momrace43       -1.13737    0.14217  -8.000 1.24e-15 ***
## momrace44       -1.53370    0.17460  -8.784  < 2e-16 ***
## prediab1        -0.30697    0.41426  -0.741 0.458692    
## prehtn1         -0.13190    0.28396  -0.465 0.642283    
## gravcat2        -0.04823    0.12231  -0.394 0.693335    
## gravcat3        -0.13971    0.19875  -0.703 0.482077    
## v1_pregplanned2 -0.63513    0.11401  -5.571 2.53e-08 ***
## smokerpre1      -0.28149    0.14222  -1.979 0.047791 *  
## momage           1.50382    0.07320  20.544  < 2e-16 ***
## bmiprepreg      -0.31963    0.05108  -6.258 3.90e-10 ***
## dt_kcal         -0.46464    0.07814  -5.947 2.74e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Name of linear predictor: log(mu[,1]/mu[,2]) 
## 
## Residual deviance: 1523.362 on 10005 degrees of freedom
## 
## Log-likelihood: -761.6808 on 10005 degrees of freedom
## 
## Number of Fisher scoring iterations: 15 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Reference group is level  2  of the response
g1<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=momrace4))+ggtitle("Cluster groups by mom race")
g2<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=momeduc))+ggtitle("Cluster groups by mom education")
g3<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=smokerpre))+ggtitle("Cluster groups by smoking status")
g4<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=gravcat))+ggtitle("Cluster groups by gravidity")
g5<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=married))+ggtitle("Cluster groups by marriage status")
g6<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=insurance))+ggtitle("Cluster groups by insurance status")
g7<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=pree_acog))+ggtitle("Cluster groups by preeclampsia")
grid.arrange(g1,g2,g3,g4,g5,g6,g7,ncol=2,nrow=4)